import json, os, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rc
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler, minmax_scale, scale, maxabs_scale
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_curve
from sklearn.metrics import recall_score, precision_score, confusion_matrix, roc_auc_score
from tqdm.auto import tqdm as tn
import joblib
import sys
import json
import time
sys.path.append('../')
import pulicFuncs as pf
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
rc('mathtext', default='regular')
mpl.font_manager.fontManager.addfont(r"./wqy-microhei.ttc")
# mpl.font_manager.fontManager.ttflist lists the names of the temporarily registered fonts
mpl.rcParams['axes.unicode_minus'] = False #负号正常显示
mpl.rcParams['font.sans-serif'] = ['WenQuanYi Micro Hei']
myfont = mpl.font_manager.FontProperties(
fname=r"wqy-microhei.ttc")
pd.set_option('display.max_columns', None) # pandas 完全显示列
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, Lasso, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier
def draw_feature_importance(df, importance_name='fscore', top_N=10):
    """Plot a bar chart of feature importances with a dashed TOP-N marker.

    Parameters
    ----------
    df : DataFrame with a 'features' column and an importance column,
        assumed sorted by importance descending.
    importance_name : str, name of the importance column to plot.
    top_N : int, a vertical dashed line is drawn after the top-N bars.
    """
    n_features = df.shape[0]
    # Scale the figure width with the number of features so labels stay legible.
    df.plot(kind='bar', x='features', y=importance_name,
            legend=False, figsize=(n_features * 0.3, 10))
    # Mark the cut-off between the top-N features and the rest; the line
    # height matches the largest importance value (first row).
    top_value = df[importance_name][0]
    plt.vlines(x=top_N + 0.5, ymin=0, ymax=top_value,
               linestyles='--', color='black')
    plt.text(top_N, top_value - 0.002, f'TOP{top_N}', ha='right')
    # fontproperties=myfont so CJK feature names render correctly.
    plt.title('XGBoost Feature Importance', fontproperties=myfont)
    plt.xticks(ticks=range(n_features), labels=df['features'].tolist(),
               rotation=90, fontproperties=myfont)
    plt.xlabel('features', fontproperties=myfont)
    plt.ylabel('relative importance', fontproperties=myfont)
    plt.show()
all_data = pd.read_csv('./data/20221027数据/1819训练数据.csv')
all_testSet = pd.read_csv('./data/20221027数据/20年测试数据.csv')
all_data.shape
(3463, 413)
all_data = all_data.fillna(-9999999)
all_testSet = all_testSet.fillna(-9999999)
def ww_get_value_counts_and_rate(df, colname):
    """Print and return the count and proportion of each value in a column.

    Parameters
    ----------
    df : pd.DataFrame
        Data to summarize.
    colname : str
        Column whose value distribution is reported.

    Returns
    -------
    pd.DataFrame
        Indexed by the distinct values of ``colname``, with columns
        ``'<colname>_num'`` (counts) and ``'<colname>_rate'`` (share of rows).
        Returning the frame (the original only printed it) lets callers
        reuse the summary; existing call sites ignore the return value.
    """
    counts = df[colname].value_counts()  # compute once, reuse for both columns
    tmp = pd.DataFrame({
        f'{colname}_num': counts,
        f'{colname}_rate': counts / df.shape[0],
    })
    print(tmp)
    return tmp
ww_get_value_counts_and_rate(all_data, 'target') # 统计一下标签每个取值的数量
target_num target_rate 0 2506 0.72365 1 957 0.27635
df = all_data[all_data['target'] != -9999999].copy()
print(df.shape)
target_dist = df['target'].value_counts().sort_index()
good_bad_rate = target_dist[0] / target_dist[1] # 0是好客户,1是坏客户
print(f'好坏比:{good_bad_rate}')
(3463, 413) 好坏比:2.618599791013584
# index = list(range(len(df))) # 1~3182,对于行号
# np.random.shuffle(index)
# index_df = pd.DataFrame(index, columns=['index']) # 生成一个df,只有一列,是乱序的行号
# index_df.to_csv(f'./20221027数据/index_ALL.csv', index=False) # index=False 不会把df的行index写入csv
index_df = pd.read_csv(f'./data/20221027数据/index_ALL.csv') # 打乱的顺序保存在csv后再读取,便于复现
index = index_df['index'].values.tolist() # index 是行索引乱序列表
data_df = df.iloc[index[:], :].copy() # 获得行乱序的训练数据
testSet = all_testSet.copy() # 测试数据不打乱
data_df.shape, testSet.shape
((3463, 413), (2792, 413))
ww_get_value_counts_and_rate(data_df, 'target')
ww_get_value_counts_and_rate(testSet, 'target')
target_num target_rate 0 2506 0.72365 1 957 0.27635 target_num target_rate 0 2155 0.771848 1 637 0.228152
# List of features the business team cares about
care = ['区域经济_生产总值(GDP)(元)', '区域经济_生产总值同比增长(%)', '区域财政_财政收入(元)',
'区域财政_一般预算收入增长', '区域财政_财政自给率(%)', '区域财政_一般预算收入占比(%)',
'区域财政_基金收入占比(%)', '区域财政_政府负债率(%)', '区域财政_地方政府债务率(%)',
'区域利差_1年期AAA', '区域利差_1年期AA+', '区域利差_1年期AA', '区域利差_1年期不限评级',
'区域利差_2年期AAA', '区域利差_2年期AA+', '区域利差_2年期AA', '区域利差_2年期不限评级',
'区域利差_3年期AAA', '区域利差_3年期AA+', '区域利差_3年期AA', '区域利差_3年期不限评级',
'区域利差_4年期AAA', '区域利差_4年期AA+', '区域利差_4年期AA', '区域利差_4年期不限评级',
'区域贷款_区域银行贷款余额(亿元)']
null_rate = 0.85
null_rate_df = pf.get_null_rate_df(data_df.iloc[:, 8:].replace(-9999999, np.nan))
Nnll_features = null_rate_df[null_rate_df['null_rate'] < null_rate]['features'].tolist() # 获得小于指定缺失值的特征列表
null_ratio = len(Nnll_features) / len(df.columns[8:])
print(len(Nnll_features), null_ratio)
383 0.945679012345679
null_rate_df[null_rate_df['features'].isin(care)]
| features | null_rate | |
|---|---|---|
| 27 | 区域利差_3年期AAA | 0.696795 |
| 30 | 区域利差_2年期AAA | 0.621138 |
| 31 | 区域利差_1年期AAA | 0.616806 |
| 33 | 区域利差_4年期AAA | 0.581288 |
| 34 | 区域利差_3年期AA | 0.557031 |
| 51 | 区域利差_3年期AA+ | 0.417846 |
| 58 | 区域利差_2年期AA+ | 0.350563 |
| 59 | 区域利差_1年期AA+ | 0.341611 |
| 67 | 区域利差_2年期AA | 0.324285 |
| 68 | 区域利差_1年期AA | 0.320243 |
| 74 | 区域利差_4年期AA+ | 0.274617 |
| 75 | 区域利差_3年期不限评级 | 0.272596 |
| 95 | 区域利差_2年期不限评级 | 0.199249 |
| 96 | 区域利差_4年期AA | 0.199249 |
| 97 | 区域利差_1年期不限评级 | 0.193474 |
| 133 | 区域利差_4年期不限评级 | 0.112619 |
| 363 | 区域财政_基金收入占比(%) | 0.033786 |
| 370 | 区域财政_一般预算收入增长 | 0.020791 |
| 372 | 区域贷款_区域银行贷款余额(亿元) | 0.012995 |
| 386 | 区域财政_地方政府债务率(%) | 0.000866 |
| 390 | 区域财政_政府负债率(%) | 0.000866 |
| 392 | 区域经济_生产总值同比增长(%) | 0.000289 |
| 393 | 区域经济_生产总值(GDP)(元) | 0.000000 |
| 394 | 区域财政_一般预算收入占比(%) | 0.000000 |
| 397 | 区域财政_财政自给率(%) | 0.000000 |
| 402 | 区域财政_财政收入(元) | 0.000000 |
pf.draw_null_rate_distribution(
data_df.iloc[:, 8:], axis=0, ascending=False, n_top=20, null_rate=null_rate)
# 缺失率小于85%的特征的占比为95%,绘制了缺失率前20的特征
## 每个特征按照年度进行缺失率统计
N = 8
null_rate = 0.5
tmp = pd.concat([data_df, testSet], axis=0)
tmp.iloc[:, N:] = tmp.iloc[:, N:].replace(-9999999, np.nan).isnull() # 对8个后面的特征字段,转换为是否为空,便于统计
tmp_null_df = tmp.groupby(['year']).sum() / tmp.groupby(['year']).count() # sum 对于 false 为0
feature_null_df = pd.DataFrame(
(tmp_null_df.iloc[:, N:] < null_rate).sum(), columns=['符合要求的年数']) # 统计出每个特征,三年内每年小于指定缺失率的次数
Nnll_features = feature_null_df[feature_null_df['符合要求的年数'] == 3].index.tolist()
print(len(Nnll_features))
352
set(care) - set(Nnll_features) # 比较关注的特征中,不满足三年缺失率的特征
{'区域利差_1年期AAA', '区域利差_2年期AAA', '区域利差_3年期AA', '区域利差_3年期AAA', '区域利差_4年期AAA'}
value_features = data_df.columns[8:].tolist()
X_training, X_validation, y_training, y_validation = train_test_split(
data_df.loc[:, value_features],
data_df.loc[:, 'target'],
test_size=0.3,
random_state=0,
stratify=data_df.loc[:, 'target'] # 控制训练集和验证集中target的分布
)
X_testing = testSet.loc[:, value_features].copy()
y_testing = testSet['target']
X_training.shape[0], X_validation.shape[0], X_testing.shape[0]
(2424, 1039, 2792)
print(y_training.value_counts())
print(y_validation.value_counts()) # 很符合之前的分布
0 1754 1 670 Name: target, dtype: int64 0 752 1 287 Name: target, dtype: int64
tmp = pd.concat([data_df, testSet], axis=0)
iv_df, woe_df = pf.iv_woe(
tmp[['target'] + care], # 查看业务老师关注的特征的iv与woe
'target', bins=20
)
iv_woe: 0%| | 0/26 [00:00<?, ?it/s]
iv_df
| features | iv | |
|---|---|---|
| 0 | 区域财政_财政收入(元) | 1.231433 |
| 0 | 区域财政_财政自给率(%) | 1.054307 |
| 0 | 区域经济_生产总值(GDP)(元) | 1.052078 |
| 0 | 区域利差_4年期不限评级 | 1.017426 |
| 0 | 区域贷款_区域银行贷款余额(亿元) | 0.909544 |
| 0 | 区域利差_2年期不限评级 | 0.746671 |
| 0 | 区域利差_1年期不限评级 | 0.724916 |
| 0 | 区域利差_4年期AA+ | 0.662673 |
| 0 | 区域财政_一般预算收入增长 | 0.659968 |
| 0 | 区域利差_3年期不限评级 | 0.655208 |
| 0 | 区域利差_4年期AAA | 0.615490 |
| 0 | 区域利差_4年期AA | 0.597348 |
| 0 | 区域利差_3年期AA+ | 0.591655 |
| 0 | 区域财政_一般预算收入占比(%) | 0.567629 |
| 0 | 区域利差_3年期AAA | 0.563559 |
| 0 | 区域财政_基金收入占比(%) | 0.537443 |
| 0 | 区域利差_1年期AA+ | 0.528846 |
| 0 | 区域利差_1年期AA | 0.518764 |
| 0 | 区域财政_政府负债率(%) | 0.512509 |
| 0 | 区域利差_2年期AAA | 0.510014 |
| 0 | 区域利差_2年期AA+ | 0.502329 |
| 0 | 区域利差_1年期AAA | 0.444882 |
| 0 | 区域利差_2年期AA | 0.437265 |
| 0 | 区域财政_地方政府债务率(%) | 0.359108 |
| 0 | 区域利差_3年期AA | 0.358700 |
| 0 | 区域经济_生产总值同比增长(%) | 0.206396 |
woe_df
| features | Cutoff | N | Events | % of Events | Non_Events | % of Non_Events | woe | iv | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 区域财政_财政收入(元) | (-9999999.001, 28085150000.0] | 325 | 147 | 0.092221 | 178 | 0.038189 | 0.881617 | 0.047636 |
| 1 | 区域财政_财政收入(元) | (28085150000.0, 36280690000.0] | 302 | 170 | 0.106650 | 132 | 0.028320 | 1.325954 | 0.103863 |
| 2 | 区域财政_财政收入(元) | (36280690000.0, 43923306000.0] | 312 | 149 | 0.093476 | 163 | 0.034971 | 0.983162 | 0.057520 |
| 3 | 区域财政_财政收入(元) | (43923306000.0, 54301870000.0] | 324 | 142 | 0.089084 | 182 | 0.039047 | 0.824789 | 0.041271 |
| 4 | 区域财政_财政收入(元) | (54301870000.0, 61263140000.0] | 306 | 135 | 0.084693 | 171 | 0.036687 | 0.836579 | 0.040161 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 412 | 区域经济_生产总值同比增长(%) | (7.4, 7.7] | 296 | 95 | 0.059598 | 201 | 0.043124 | 0.323549 | 0.005331 |
| 413 | 区域经济_生产总值同比增长(%) | (7.7, 7.9] | 337 | 110 | 0.069009 | 227 | 0.048702 | 0.348508 | 0.007077 |
| 414 | 区域经济_生产总值同比增长(%) | (7.9, 8.1] | 371 | 114 | 0.071518 | 257 | 0.055138 | 0.260102 | 0.004261 |
| 415 | 区域经济_生产总值同比增长(%) | (8.1, 8.5] | 243 | 77 | 0.048306 | 166 | 0.035615 | 0.304794 | 0.003869 |
| 416 | 区域经济_生产总值同比增长(%) | (8.5, 12.0] | 297 | 142 | 0.089084 | 155 | 0.033255 | 0.985367 | 0.055013 |
417 rows × 9 columns
tmp = pd.concat([data_df, testSet], axis=0)
iv_df, woe_df = pf.iv_woe(
tmp[['target'] + Nnll_features], # Nnll_features 是通过每年缺失率选出来的
'target', bins=20
)
iv_woe: 0%| | 0/352 [00:00<?, ?it/s]
selected_features = iv_df[iv_df['iv']>0.1]['features'].tolist()
len(selected_features)
208
# XGBoost training parameters for the feature-selection model.
params = {
    'learning_rate': 0.1,
    'random_state': 47,
    'max_depth': 3,                      # shallow trees to limit overfitting
    'min_child_weight': 20,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'gamma': 0,
    'objective': 'binary:logistic',
    'scale_pos_weight': good_bad_rate,   # compensate for class imbalance (good/bad ratio)
    'nthread': 14,
    # 'silent' is no longer a recognized core parameter (the training log
    # warned: Parameters: { "silent" } might not be used); 'verbosity': 0
    # is the supported way to suppress XGBoost output.
    'verbosity': 0,
}
tmp = pd.concat([data_df, testSet], axis=0)
feature_importance_df = pf.get_feature_importance_from_xgboost(
# 用Xgboost进行特征筛选,并从xgboost获取feature importance
X_training[Nnll_features], # 训练集只取部分
y_training,
rounds=300, # 迭代轮数
params=params # XGB的参数
)
[13:37:36] WARNING: ../src/learner.cc:576:
Parameters: { "silent" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
[13:37:36] WARNING: ../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
('feature number:', 316)
feature_importance_df
| features | fscore | |
|---|---|---|
| 0 | 资产规模数据_资产总计 | 0.015593 |
| 1 | 财报衍生_其他应收款/总资产 | 0.015593 |
| 2 | 成长能力_营业利润(N年,增长率) | 0.011525 |
| 3 | 区域利差_4年期AA | 0.011525 |
| 4 | 成长能力_每股经营活动产生的现金流量净额(同比增长率) | 0.010847 |
| ... | ... | ... |
| 311 | 盈利能力_财务费用/营业总收入(TTM) | 0.000678 |
| 312 | 盈利能力_营业总成本/营业总收入(TTM) | 0.000678 |
| 313 | 盈利能力_成本费用利润率 | 0.000678 |
| 314 | 盈利能力_息税前利润/营业总收入 | 0.000678 |
| 315 | 偿债能力_产权比率 | 0.000678 |
316 rows × 2 columns
# First filter features by importance, then filter again by IV
# importance_features = feature_importance_df['features'].tolist()[:N_features]
# iv_df, woe_df = pf.iv_woe(
# data_df[['target']+importance_features],
# 'target', bins=20
# )
# iv_df = iv_df.rename(columns={'iv': 'IV'})
# iv_df['feature_class'] = iv_df['features'].map(lambda x: x.split('_')[0])
# iv_df.index = iv_df['features'].tolist()
# iv_df[:50]
draw_feature_importance(feature_importance_df[:60], 'fscore', 30)
selected_features = feature_importance_df['features'].tolist()[:60]
# 查看选出特征的相关性
uncorr_features = pf.get_uncorr_features(
data_df[['target']+selected_features],
corr_rate=0.95, target_name='target')
print(len(uncorr_features))
40
# pf.draw_corr_heatmap(data_df[['target']+uncorr_features], target_name='target')
# 通过相关性进一步筛选特征
uncorr_feature_df = feature_importance_df[
feature_importance_df['features'].isin(uncorr_features)
].sort_values('fscore', ascending=False)
selected_features = uncorr_feature_df['features'].tolist()[:30]
selected_features
['资产规模数据_资产总计', '财报衍生_其他应收款/总资产', '成长能力_营业利润(N年,增长率)', '区域利差_4年期AA', '成长能力_每股经营活动产生的现金流量净额(同比增长率)', '区域财政_政府负债率(%)', '营运能力_固定资产周转率', '盈利能力_EBITDA/营业总收入', '区域利差_4年期不限评级', '区域财政_税收返还收入(元)', '区域利差_2年期不限评级', '资本结构_长期资产适合率', '营运能力_应收账款周转天数', '偿债能力_货币资金/短期债务', '区域经济_人均GDP(元)', '财报衍生_净利润', '财报衍生_流动负债增长率', '财报衍生_有形净资产', '财报衍生_在建工程', '财报衍生_固定资产', '现金流量_投资活动产生的现金流量净额占比', '区域利差_2年期AA', '财报衍生_净资产(即所有者权益)', '区域财政_专项债券付息支出(元)', '成长能力_价值变动净收益(N年,增长率)', '财报衍生_预收帐款周转天数', '财报衍生_资金回笼率', '财报衍生_有形总资产', '区域财政_存续城投债务规模/一般预算收入(%)', '区域财政_国有资本经营收入(元)']
# 基于业务需要,指定了入模特征
last_selected_features = ['资产规模数据_资产总计', '成交_成交土地均价(元/㎡)',
'财报衍生_净资产(即所有者权益)', '区域财政_政府负债率(%)',
'区域财政_财政自给率(%)', '财报衍生_流动资产',
'区域财政_存续城投债务规模/一般预算收入(%)', '财报衍生_固定资产',
'偿债能力_货币资金/短期债务', '财报衍生_净利润',
'区域经济_生产总值(GDP)(元)',
'区域贷款_区域银行贷款余额(亿元)',
'区域财政_一般预算收入占比(%)',
'区域利差_1年期不限评级',
'区域财政_地方政府债务率(%)', '区域财政_一般预算收入增长',
'偿债能力_现金比率', '财报衍生_其他应付款/流动负债',
#'财报衍生_资金回笼率', # 21年数据没这个特征
'每股指标_每股现金流量净额',
'区域财政_基金收入占比(%)', '财报衍生_其他应收款/总资产',
'偿债能力_有形净值债务率', '财报衍生_无形资产',
#'财报衍生_流动负债增长率',
'财报衍生_存货周转天数',
'区域经济_生产总值同比增长(%)', '财报衍生_其他应付款/流动资产',
'成长能力_净利润复合年增长率', '区域财政_财政收入(元)']
# 由于21年数据有两个特征没有,于是剔除,从备选列表中找两个特征补充
# tmp = list(set(importance_features[:(30)]) - set(last_selected_features))
# beixuan = ['财报衍生_预付账款周转天数',
# '财报衍生_营业利润+管理费用+财务费用', '财报衍生_应收账款周转率', '财报衍生_净资产同比增长率',
# '财报衍生_净资产(即所有者权益)', '财报衍生_其他应收款/总资产', '财报衍生_其他应付款/流动资产',
# '财报衍生_其他应付款/流动负债', '财报衍生_两金占流动资产比重', '财报衍生_流动资产', '财报衍生_固定资产',
# '财报衍生_净利润', '资产规模数据_资产总计', '盈利能力_资产减值损失/营业总收入(TTM)', '偿债能力_货币资金/短期债务',
# '成长能力_营业收入(同比增长率)', '成长能力_营业利润(N年,增长率)', '每股指标_每股现金流量净额',
# '成长能力_每股净资产(相对年初增长率)', '成长能力_归属母公司股东的权益(相对年初增长率)', '营运能力_应收账款周转天数',
# '营运能力_应付账款周转天数',
# '偿债能力_现金比率',
# '财报衍生_存货周转天数',
# '成长能力_净利润复合年增长率',
# '偿债能力_有形净值债务率',
# '财报衍生_无形资产'
# '成交_土地宗数(宗)', '成交_土地出让金(万元)', '成交_建设用地面积(万㎡)',
# '成交_规划建筑面积(万㎡)', '成交_平均溢价率(%)', '成交_成交楼面均价(元/㎡)', '成交_成交土地均价(元/㎡)',
# '流拍_土地宗数(宗)', '流拍_建设用地面积(万㎡)', '流拍_规划建筑面积(万㎡)', '流拍_起始价(万元)',
# '流拍_流拍楼面均价(元/㎡)', '流拍_流拍土地均价(元/㎡)', '流拍率(流拍/成交)', '流拍率(流拍/(成交+流拍))']
# [i for i in tmp if i in beixuan]
selected_features = last_selected_features + ['成长能力_营业利润(N年,增长率)', '成长能力_归属母公司股东的权益(相对年初增长率)']
len(selected_features)
30
print(X_training.shape, X_validation.shape, X_testing.shape)
print(y_training.shape, y_validation.shape, y_testing.shape)
(2424, 405) (1039, 405) (2792, 405) (2424,) (1039,) (2792,)
X_training_selected = X_training[selected_features]
X_validation_selected = X_validation[selected_features]
# model = XGBClassifier(
# learning_rate=0.01,
# random_state=47,
# n_estimators=500,
# max_depth=4,
# min_child_weight=20,
# subsample=0.8,
# colsample_bytree=0.75,
# gamma=0,
# objective='binary:logistic',
# nthread=14,
# scale_pos_weight=good_bad_rate,
# )
# model = RandomForestClassifier(
# n_estimators=100,
# max_depth=4,
# class_weight='balanced',
# random_state=47
# )
model = CatBoostClassifier(
iterations=300,
depth=4,
learning_rate=0.01,
l2_leaf_reg=3,
# loss_function='CrossEntropy',
verbose=False,
scale_pos_weight=good_bad_rate,
random_seed = 0,
)
# model = CatBoostClassifier(
# iterations=70,
# depth=4,
# learning_rate=0.08,
# l2_leaf_reg=3.0,
# # loss_function='CrossEntropy',
# verbose=False,
# scale_pos_weight=good_bad_rate,
# random_seed=0,
# )
model.fit(X_training_selected, y_training)
<catboost.core.CatBoostClassifier at 0x7f122d6bf890>
# 计算模型分
input_data={
'Training': {
'X' : X_training_selected,
'y' : y_training,
},
'Validation': {
'X' : X_validation_selected,
'y' : y_validation,
},
'Testing': {
'X' : X_testing[selected_features],
'y' : y_testing,
},
}
score_df = pf.print_model_scores(model, input_data, is_print=False)
score_df
| Training | Validation | Testing | Validation - Training | Testing - Training | Testing - Validation | |
|---|---|---|---|---|---|---|
| Accuracy | 0.776815 | 0.769971 | 0.746777 | -0.006844 | -0.030039 | -0.023195 |
| Precision | 0.565615 | 0.558252 | 0.467652 | -0.007363 | -0.097963 | -0.090600 |
| Recall | 0.829851 | 0.801394 | 0.794349 | -0.028457 | -0.035502 | -0.007045 |
| F1 | 0.672716 | 0.658083 | 0.588714 | -0.014633 | -0.084002 | -0.069369 |
| AUC | 0.881921 | 0.849454 | 0.837653 | -0.032467 | -0.044268 | -0.011801 |
| KS | 0.589898 | 0.571795 | 0.531200 | -0.018103 | -0.058698 | -0.040594 |
# pkl_path = './20221027数据/GBDT_城投中债隐含评级.pkl'
# model = joblib.load(pkl_path)
# # model.get_all_params()
# 使用测试集
X_merged = X_testing[selected_features]
y_merged = y_testing
target0 = model.predict_proba(X_merged[y_merged == 0]) # 0是好客户,1是坏客户
target1 = model.predict_proba(X_merged[y_merged == 1])
all_probs = {
'good': target0[:, 1], # 取预测概率的第1列,即预测为1的概率,即坏客户的概率
'bad': target1[:, 1],
}
# 所以理论上,概率越大,就越多坏客户
ddf = pf.draw_proba_distribution(all_probs, step=0.1, is_return=True, fig_save=False)
# ddf.to_excel('./ddf.xlsx')
ddf # 等宽
| good | bad | ALL | rate_good | rate_bad | |
|---|---|---|---|---|---|
| 0.0-0.1 | 296 | 0 | 296 | 1.000000 | 0.000000 |
| 0.1-0.2 | 406 | 6 | 412 | 0.985437 | 0.014563 |
| 0.2-0.3 | 356 | 27 | 383 | 0.929504 | 0.070496 |
| 0.3-0.4 | 291 | 54 | 345 | 0.843478 | 0.156522 |
| 0.4-0.5 | 230 | 44 | 274 | 0.839416 | 0.160584 |
| 0.5-0.6 | 188 | 76 | 264 | 0.712121 | 0.287879 |
| 0.6-0.7 | 174 | 126 | 300 | 0.580000 | 0.420000 |
| 0.7-0.8 | 140 | 169 | 309 | 0.453074 | 0.546926 |
| 0.8-0.9 | 73 | 130 | 203 | 0.359606 | 0.640394 |
| 0.9-1.0 | 1 | 5 | 6 | 0.166667 | 0.833333 |
## 通过观察排序性和模型分调参,网格搜索
# search_params = {
# 'iterations': range(400, 1001, 200),
# 'depth': [4],
# 'learning_rate': [0.1, 0.05, 0.01, 0.02],
# 'l2_leaf_reg': [0, 7, 10, 20, 50, 100],
# 'scale_pos_weight': [good_bad_rate]
# }
# times = 1
# for k, v in search_params.items():
# times *= len(v)
# print(times)
# model = CatBoostClassifier(
# iterations=1000,
# depth=4,
# learning_rate=0.01,
# l2_leaf_reg=7,
# verbose=False,
# scale_pos_weight=good_bad_rate,
# )
# model = pf.k_fold_model_with_param_search(
# model=model,
# X=X_training_selected,
# y=y_training,
# search_params=search_params,
# score_fnc='auc',
# K=5,
# verbose=1
# )
model.feature_importances_
array([17.59289737, 18.43422405, 6.18989722, 7.72593761, 4.78080144,
2.21646645, 1.51962489, 3.14235894, 1.57769111, 2.22454699,
1.61439207, 2.9287834 , 2.8568244 , 2.87556112, 3.53823908,
0.97630477, 2.02346584, 2.59547113, 0.86194183, 1.77709818,
4.87046848, 1.30881883, 0.75369305, 0.57469032, 0.33424009,
1.04489914, 0.54679083, 1.43597084, 0.49217855, 1.18572198])
# 将重要性转换为百分比
model_feature_importance = pd.DataFrame()
model_feature_importance['importance'] = model.feature_importances_ / sum(
model.feature_importances_)
model_feature_importance['features'] = selected_features
model_feature_importance.sort_values('importance', ascending=False, inplace=True)
model_feature_importance.index = range(model_feature_importance.shape[0])
draw_feature_importance(model_feature_importance, 'importance', 10)
model_feature_importance
| importance | features | |
|---|---|---|
| 0 | 0.184342 | 成交_成交土地均价(元/㎡) |
| 1 | 0.175929 | 资产规模数据_资产总计 |
| 2 | 0.077259 | 区域财政_政府负债率(%) |
| 3 | 0.061899 | 财报衍生_净资产(即所有者权益) |
| 4 | 0.048705 | 财报衍生_其他应收款/总资产 |
| 5 | 0.047808 | 区域财政_财政自给率(%) |
| 6 | 0.035382 | 区域财政_地方政府债务率(%) |
| 7 | 0.031424 | 财报衍生_固定资产 |
| 8 | 0.029288 | 区域贷款_区域银行贷款余额(亿元) |
| 9 | 0.028756 | 区域利差_1年期不限评级 |
| 10 | 0.028568 | 区域财政_一般预算收入占比(%) |
| 11 | 0.025955 | 财报衍生_其他应付款/流动负债 |
| 12 | 0.022245 | 财报衍生_净利润 |
| 13 | 0.022165 | 财报衍生_流动资产 |
| 14 | 0.020235 | 偿债能力_现金比率 |
| 15 | 0.017771 | 区域财政_基金收入占比(%) |
| 16 | 0.016144 | 区域经济_生产总值(GDP)(元) |
| 17 | 0.015777 | 偿债能力_货币资金/短期债务 |
| 18 | 0.015196 | 区域财政_存续城投债务规模/一般预算收入(%) |
| 19 | 0.014360 | 区域财政_财政收入(元) |
| 20 | 0.013088 | 偿债能力_有形净值债务率 |
| 21 | 0.011857 | 成长能力_归属母公司股东的权益(相对年初增长率) |
| 22 | 0.010449 | 财报衍生_其他应付款/流动资产 |
| 23 | 0.009763 | 区域财政_一般预算收入增长 |
| 24 | 0.008619 | 每股指标_每股现金流量净额 |
| 25 | 0.007537 | 财报衍生_无形资产 |
| 26 | 0.005747 | 财报衍生_存货周转天数 |
| 27 | 0.005468 | 成长能力_净利润复合年增长率 |
| 28 | 0.004922 | 成长能力_营业利润(N年,增长率) |
| 29 | 0.003342 | 区域经济_生产总值同比增长(%) |
# model_feature_importance.to_csv('./models/v1/model_feature_importance.csv', index=False)
X_merged = X_testing[selected_features]
y_merged = y_testing
target0 = model.predict_proba(X_merged[y_merged == 0])
target1 = model.predict_proba(X_merged[y_merged == 1])
all_probs = {
'good': target0[:, 1], # 取预测概率的第1列,即预测为1的概率,即坏客户的概率
'bad': target1[:, 1],
}
# 所以理论上,概率越大,就越多坏客户
ddf = pf.draw_proba_distribution(all_probs, step=0.05, is_return=True, fig_save=False)
# ddf
# ddf.to_excel('./ddf.xlsx')
row=1
target0[:, row].min(), target1[:, row].min(), target0[:, row].max(), target1[:, row].max()
(0.021938861706956424, 0.13981124735876804, 0.9049597121693593, 0.9059573864639114)
y_testing.value_counts()
0 2155 1 637 Name: target, dtype: int64
ddf
| good | bad | ALL | rate_good | rate_bad | |
|---|---|---|---|---|---|
| 0.0-0.05 | 97 | 0 | 97 | 1.000000 | 0.000000 |
| 0.05-0.1 | 199 | 0 | 199 | 1.000000 | 0.000000 |
| 0.1-0.15 | 235 | 2 | 237 | 0.991561 | 0.008439 |
| 0.15-0.2 | 171 | 4 | 175 | 0.977143 | 0.022857 |
| 0.2-0.25 | 184 | 9 | 193 | 0.953368 | 0.046632 |
| 0.25-0.3 | 172 | 18 | 190 | 0.905263 | 0.094737 |
| 0.3-0.35 | 164 | 19 | 183 | 0.896175 | 0.103825 |
| 0.35-0.4 | 127 | 35 | 162 | 0.783951 | 0.216049 |
| 0.4-0.45 | 109 | 21 | 130 | 0.838462 | 0.161538 |
| 0.45-0.5 | 121 | 23 | 144 | 0.840278 | 0.159722 |
| 0.5-0.55 | 99 | 29 | 128 | 0.773438 | 0.226562 |
| 0.55-0.6 | 89 | 47 | 136 | 0.654412 | 0.345588 |
| 0.6-0.65 | 84 | 52 | 136 | 0.617647 | 0.382353 |
| 0.65-0.7 | 90 | 74 | 164 | 0.548780 | 0.451220 |
| 0.7-0.75 | 80 | 87 | 167 | 0.479042 | 0.520958 |
| 0.75-0.8 | 60 | 82 | 142 | 0.422535 | 0.577465 |
| 0.8-0.85 | 62 | 84 | 146 | 0.424658 | 0.575342 |
| 0.85-0.9 | 11 | 46 | 57 | 0.192982 | 0.807018 |
| 0.9-0.95 | 1 | 5 | 6 | 0.166667 | 0.833333 |
| 0.95-1.0 | 0 | 0 | 0 | 0.000000 | 0.000000 |
pf.get_ddf_table(ddf).round(4).copy()
| 区间好 | 区间坏 | 区间总人数 | 区间好占比 | 区间坏占比 | 累计总人数 | 累计人数占比 | 累计好占比 | 累计坏占比 | 当前区间坏占比 | KS | 随机 | 提升(同区间) | 提升(累计) | 总体坏比例下降到 | 策略价值 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0-0.05 | 97 | 0 | 97 | 1.0000 | 0.0000 | 97 | 0.0347 | 0.0450 | 0.0000 | 0.0000 | -0.0450 | 0.2282 | -1.0000 | 0.0000 | 0.2282 | NaN |
| 0.05-0.1 | 199 | 0 | 199 | 1.0000 | 0.0000 | 296 | 0.1060 | 0.1374 | 0.0000 | 0.0000 | -0.1374 | 0.2282 | -1.0000 | 0.0000 | 0.2282 | NaN |
| 0.1-0.15 | 235 | 2 | 237 | 0.9916 | 0.0084 | 533 | 0.1909 | 0.2464 | 0.0031 | 0.0038 | -0.2433 | 0.2282 | -0.9836 | 0.0209 | 0.2274 | -0.7692 |
| 0.15-0.2 | 171 | 4 | 175 | 0.9771 | 0.0229 | 708 | 0.2536 | 0.3258 | 0.0094 | 0.0085 | -0.3163 | 0.2282 | -0.9629 | 0.0471 | 0.2260 | -0.9782 |
| 0.2-0.25 | 184 | 9 | 193 | 0.9534 | 0.0466 | 901 | 0.3227 | 0.4111 | 0.0235 | 0.0166 | -0.3876 | 0.2282 | -0.9270 | 0.0942 | 0.2228 | -1.1913 |
| 0.25-0.3 | 172 | 18 | 190 | 0.9053 | 0.0947 | 1091 | 0.3908 | 0.4910 | 0.0518 | 0.0302 | -0.4391 | 0.2282 | -0.8674 | 0.1727 | 0.2163 | -1.3778 |
| 0.3-0.35 | 164 | 19 | 183 | 0.8962 | 0.1038 | 1274 | 0.4563 | 0.5671 | 0.0816 | 0.0408 | -0.4854 | 0.2282 | -0.8211 | 0.2332 | 0.2095 | -1.4033 |
| 0.35-0.4 | 127 | 35 | 162 | 0.7840 | 0.2160 | 1436 | 0.5143 | 0.6260 | 0.1366 | 0.0606 | -0.4894 | 0.2282 | -0.7345 | 0.3414 | 0.1970 | -1.4206 |
| 0.4-0.45 | 109 | 21 | 130 | 0.8385 | 0.1615 | 1566 | 0.5609 | 0.6766 | 0.1695 | 0.0690 | -0.5070 | 0.2282 | -0.6977 | 0.3768 | 0.1895 | -1.3366 |
| 0.45-0.5 | 121 | 23 | 144 | 0.8403 | 0.1597 | 1710 | 0.6125 | 0.7327 | 0.2057 | 0.0766 | -0.5271 | 0.2282 | -0.6642 | 0.4113 | 0.1812 | -1.2612 |
| 0.5-0.55 | 99 | 29 | 128 | 0.7734 | 0.2266 | 1838 | 0.6583 | 0.7787 | 0.2512 | 0.0871 | -0.5275 | 0.2282 | -0.6185 | 0.4567 | 0.1708 | -1.1798 |
| 0.55-0.6 | 89 | 47 | 136 | 0.6544 | 0.3456 | 1974 | 0.7070 | 0.8200 | 0.3250 | 0.1049 | -0.4950 | 0.2282 | -0.5404 | 0.5416 | 0.1540 | -1.0749 |
| 0.6-0.65 | 84 | 52 | 136 | 0.6176 | 0.3824 | 2110 | 0.7557 | 0.8589 | 0.4066 | 0.1227 | -0.4523 | 0.2282 | -0.4620 | 0.6255 | 0.1354 | -0.9488 |
| 0.65-0.7 | 90 | 74 | 164 | 0.5488 | 0.4512 | 2274 | 0.8145 | 0.9007 | 0.5228 | 0.1464 | -0.3779 | 0.2282 | -0.3582 | 0.7468 | 0.1089 | -0.7745 |
| 0.7-0.75 | 80 | 87 | 167 | 0.4790 | 0.5210 | 2441 | 0.8743 | 0.9378 | 0.6593 | 0.1721 | -0.2785 | 0.2282 | -0.2459 | 0.8791 | 0.0777 | -0.5573 |
| 0.75-0.8 | 60 | 82 | 142 | 0.4225 | 0.5775 | 2583 | 0.9251 | 0.9657 | 0.7881 | 0.1943 | -0.1776 | 0.2282 | -0.1482 | 0.9851 | 0.0484 | -0.3442 |
| 0.8-0.85 | 62 | 84 | 146 | 0.4247 | 0.5753 | 2729 | 0.9774 | 0.9944 | 0.9199 | 0.2147 | -0.0745 | 0.2282 | -0.0588 | 1.0823 | 0.0183 | -0.1390 |
| 0.85-0.9 | 11 | 46 | 57 | 0.1930 | 0.8070 | 2786 | 0.9979 | 0.9995 | 0.9922 | 0.2268 | -0.0074 | 0.2282 | -0.0057 | 1.1024 | 0.0018 | -0.0132 |
| 0.9-0.95 | 1 | 5 | 6 | 0.1667 | 0.8333 | 2792 | 1.0000 | 1.0000 | 1.0000 | 0.2282 | 0.0000 | 0.2282 | 0.0000 | 1.0526 | 0.0000 | 0.0000 |
| 0.95-1.0 | 0 | 0 | 0 | 0.0000 | 0.0000 | 2792 | 1.0000 | 1.0000 | 1.0000 | 0.2282 | 0.0000 | 0.2282 | 0.0000 | 1.0000 | 0.0000 | 0.0000 |
def ww_get_cutoff(df, target_col, bins_col, if_plot=True):
    """Build a per-bin cutoff analysis table from a scored dataframe.

    Parameters
    ----------
    df : DataFrame containing the label column and a probability-bin column
        (e.g. produced by ``pd.qcut`` on the model score).
    target_col : str, label column name (1 = bad customer, 0 = good customer).
    bins_col : str, name of the probability-bin column.
    if_plot : bool, if True draw a grouped bar chart of good/bad rate per bin.

    Returns
    -------
    (cutoff, ddf) : ``cutoff`` is the full statistics table (counts, rates,
    cumulative shares, KS, lift, residual bad rate); ``ddf`` is a compact
    good/bad summary with English column names.
    """
    cutoff = pd.DataFrame()
    # The first column is sorted descending by bin; every later assignment
    # aligns on this index, so the whole table ends up ordered from the
    # highest-probability bin downward (cumsum then accumulates from the
    # riskiest bin first).
    cutoff['区间好'] = df[df[target_col] == 0].groupby(bins_col).count()[
        target_col].sort_index(ascending=False)
    cutoff['区间坏'] = df[df[target_col] == 1].groupby(bins_col).count()[
        target_col]
    cutoff['区间总人数'] = df.groupby(bins_col).count()[target_col]
    cutoff['区间好占比'] = cutoff['区间好'] / cutoff['区间总人数']
    cutoff['区间坏占比'] = cutoff['区间坏'] / cutoff['区间总人数']
    cutoff['区间人数占比'] = cutoff['区间总人数'] / cutoff['区间总人数'].sum()
    cutoff['累计总人数'] = cutoff['区间总人数'].cumsum()
    cutoff['累计人数占比'] = cutoff['累计总人数'] / cutoff['区间总人数'].sum()
    cutoff['累计好占比'] = cutoff['区间好'].cumsum() / cutoff['区间好'].sum()
    # Cumulative bad share = recall if every bin down to this row is flagged.
    cutoff['累计坏占比(recall)'] = cutoff['区间坏'].cumsum() / cutoff['区间坏'].sum()
    # Cumulative bad count over cumulative total = precision at this cutoff.
    cutoff['当前区间坏占比(precision)'] = cutoff['区间坏'].cumsum() / \
        cutoff['区间总人数'].cumsum()
    # KS statistic per cutoff: gap between cumulative bad and good shares.
    cutoff['KS'] = cutoff['累计坏占比(recall)'] - cutoff['累计好占比']
    # Baseline bad rate of the whole population ("random" strategy).
    cutoff['随机'] = cutoff['区间坏'].sum() / cutoff['区间总人数'].sum()
    # Lift of precision over the baseline bad rate.
    cutoff['提升'] = cutoff['当前区间坏占比(precision)'] / cutoff['随机'] - 1
    # Residual overall bad rate if all bins down to this row are removed.
    cutoff['总体坏比例下降到'] = (
        (cutoff['区间坏'].sum() - cutoff['区间坏'].cumsum()) / cutoff['区间总人数'].sum()
    )
    cutoff = cutoff.reset_index()
    ddf = pd.DataFrame()
    ddf[['bins', 'good', 'bad', 'ALL', 'rate_good', 'rate_bad']
        ] = cutoff[['bins', '区间好', '区间坏', '区间总人数', '区间好占比', '区间坏占比']]
    if if_plot:
        plt.figure(figsize=(10, 5))
        # [::-1] restores ascending bin order for display; the opposed
        # widths (-0.3 / 0.3) place good/bad bars side by side per bin.
        good_bar = plt.bar(x=cutoff['bins'].astype(
            str)[::-1], height=cutoff['区间好占比'][::-1], align='edge', width=-0.3)
        bad_bar = plt.bar(x=cutoff['bins'].astype(
            str)[::-1], height=cutoff['区间坏占比'][::-1], align='edge', width=0.3)
        plt.xlabel('Proba Bins')
        plt.ylabel('Ratio')
        plt.title('Distribution By Labels', fontsize=20)
        plt.legend(handles=[good_bar, bad_bar], labels=[
            "rate_good", "rate_bad"], loc="upper right", fontsize=14)
        plt.xticks(rotation=60)
        plt.show()
    return cutoff, ddf
tmp = X_merged.copy()
tmp['target'] = y_merged.values.copy()
tmp['proba'] = model.predict_proba(X_merged)[:,1] # 取预测为1的概率
tmp['bins'] = pd.qcut(tmp['proba'], q=10)
cutoff, ddf = ww_get_cutoff(tmp, 'target', 'bins')
ddf
| bins | good | bad | ALL | rate_good | rate_bad | |
|---|---|---|---|---|---|---|
| 0 | (0.779, 0.906] | 105 | 175 | 280 | 0.375000 | 0.625000 |
| 1 | (0.691, 0.779] | 137 | 142 | 279 | 0.491039 | 0.508961 |
| 2 | (0.591, 0.691] | 159 | 120 | 279 | 0.569892 | 0.430108 |
| 3 | (0.485, 0.591] | 202 | 77 | 279 | 0.724014 | 0.275986 |
| 4 | (0.387, 0.485] | 233 | 46 | 279 | 0.835125 | 0.164875 |
| 5 | (0.306, 0.387] | 237 | 42 | 279 | 0.849462 | 0.150538 |
| 6 | (0.235, 0.306] | 257 | 22 | 279 | 0.921147 | 0.078853 |
| 7 | (0.156, 0.235] | 269 | 10 | 279 | 0.964158 | 0.035842 |
| 8 | (0.0964, 0.156] | 272 | 3 | 275 | 0.989091 | 0.010909 |
| 9 | (0.0209, 0.0964] | 284 | 0 | 284 | 1.000000 | 0.000000 |
cutoff
| bins | 区间好 | 区间坏 | 区间总人数 | 区间好占比 | 区间坏占比 | 区间人数占比 | 累计总人数 | 累计人数占比 | 累计好占比 | 累计坏占比(recall) | 当前区间坏占比(precision) | KS | 随机 | 提升 | 总体坏比例下降到 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | (0.779, 0.906] | 105 | 175 | 280 | 0.375000 | 0.625000 | 0.100287 | 280 | 0.100287 | 0.048724 | 0.274725 | 0.625000 | 0.226001 | 0.228152 | 1.739403 | 0.165473 |
| 1 | (0.691, 0.779] | 137 | 142 | 279 | 0.491039 | 0.508961 | 0.099928 | 559 | 0.200215 | 0.112297 | 0.497645 | 0.567084 | 0.385348 | 0.228152 | 1.485555 | 0.114613 |
| 2 | (0.591, 0.691] | 159 | 120 | 279 | 0.569892 | 0.430108 | 0.099928 | 838 | 0.300143 | 0.186079 | 0.686028 | 0.521480 | 0.499949 | 0.228152 | 1.285669 | 0.071633 |
| 3 | (0.485, 0.591] | 202 | 77 | 279 | 0.724014 | 0.275986 | 0.099928 | 1117 | 0.400072 | 0.279814 | 0.806907 | 0.460161 | 0.527093 | 0.228152 | 1.016907 | 0.044054 |
| 4 | (0.387, 0.485] | 233 | 46 | 279 | 0.835125 | 0.164875 | 0.099928 | 1396 | 0.500000 | 0.387935 | 0.879121 | 0.401146 | 0.491186 | 0.228152 | 0.758242 | 0.027579 |
| 5 | (0.306, 0.387] | 237 | 42 | 279 | 0.849462 | 0.150538 | 0.099928 | 1675 | 0.599928 | 0.497912 | 0.945055 | 0.359403 | 0.447143 | 0.228152 | 0.575280 | 0.012536 |
| 6 | (0.235, 0.306] | 257 | 22 | 279 | 0.921147 | 0.078853 | 0.099928 | 1954 | 0.699857 | 0.617169 | 0.979592 | 0.319345 | 0.362422 | 0.228152 | 0.399703 | 0.004656 |
| 7 | (0.156, 0.235] | 269 | 10 | 279 | 0.964158 | 0.035842 | 0.099928 | 2233 | 0.799785 | 0.741995 | 0.995290 | 0.283923 | 0.253295 | 0.228152 | 0.244447 | 0.001074 |
| 8 | (0.0964, 0.156] | 272 | 3 | 275 | 0.989091 | 0.010909 | 0.098496 | 2508 | 0.898281 | 0.868213 | 1.000000 | 0.253987 | 0.131787 | 0.228152 | 0.113238 | 0.000000 |
| 9 | (0.0209, 0.0964] | 284 | 0 | 284 | 1.000000 | 0.000000 | 0.101719 | 2792 | 1.000000 | 1.000000 | 1.000000 | 0.228152 | 0.000000 | 0.228152 | 0.000000 | 0.000000 |
plt.figure(figsize=(10, 7))
tmp = pd.concat([data_df, testSet], axis=0) # 注意,这里用了所有数据
for year in [2018, 2019, 2020]:
y_test_prob = model.predict_proba(tmp.loc[tmp['year']==year, selected_features])
sns.distplot(y_test_prob[:, 1], label=year)
plt.legend()
plt.xlim((0, 1))
plt.xlabel('model_score')
plt.show()
all_probs={
'good': model.predict_proba(
testSet.loc[testSet['target'] == 0, selected_features])[:, 1],
'bad': model.predict_proba(
testSet.loc[testSet['target'] == 1, selected_features])[:, 1],
}
pf.draw_score_distribution(all_probs, split_score=0.3, xlim=(0, 1))
None
('mean0 - mean1 = ', -0.29807889605519566)
0.3分以下的坏用户占坏用户总数的: 5.181%
0.3分以下的好用户占好用户总数的: 49.095%
0.3分以下好坏客户比= 32.061
0.3分以上的坏用户占坏用户总数的: 94.819%
0.3分以上的好用户占好用户总数的: 50.905%
0.3分以上好坏客户比= 1.816
tmp = pd.concat([data_df, testSet], axis=0)
for year in [2018, 2019, 2020]:
y_test_prob = tmp.loc[tmp['year']==year, 'next_year_ipd']
sns.distplot(y_test_prob, label=year)
plt.legend()
plt.show()
def score_func(x):
    """Identity score transform: return the probability unchanged.

    Placeholder hook so a monotonic score mapping can be swapped in later
    without touching the PSI computations below.
    """
    return x
targetV = model.predict_proba(X_validation_selected)
targetT = model.predict_proba(X_training_selected)
all_probs = {
'Validation': score_func(targetV[:, 1]),
'Traning': score_func(targetT[:, 1])
}
PSI = pf.get_PSI_score(all_probs, is_return=True)
PSI score: 0.020767
targetV = model.predict_proba(X_validation_selected)
targetT = model.predict_proba(X_testing[selected_features])
all_probs = {
'Validation': score_func(targetV[:, 1]),
'Testing': score_func(targetT[:, 1])
}
PSI = pf.get_PSI_score(all_probs, is_return=True)
PSI score: 0.025071
y_test_prob = model.predict_proba(X_testing[selected_features])
ksdf = pf.draw_KS_line(y_test_prob[:, 0], y_testing, n=20, # 用0列
fig_save=False, is_return=True,
mode='plt')
# 保存模型与对应features的importance
# date = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()+8*3600))
# message = f'城投中债隐含评级>=6_20220929'
# joblib.dump(model, f'./20220929任务结果/{date}_GBDT_{message}.pkl')
# model_feature_importance.to_csv(
# f'./20220929任务结果/{date}_feature_importance_{message}.csv',
# index=False
# )
# print(f'./pkls/{date}_GBDT_{message}.pkl')
# print(model.feature_names_)
def bin_split(data, col, target, cut_num, hue=None, is_discrete=False):
    '''Equal-frequency binning analysis of a single variable.

    Bins `col`, computes the per-bin bad rate and population ratio
    (optionally split by `hue`), draws the two bar charts side by side,
    and returns the grouped table.
    PS: the split point of the last bin is a closed interval on both ends.

    Parameters
    ----------
    data : dataFrame
        Table containing the variable to analyse.
    col : string
        Name of the feature to analyse.
    target : string
        Name of the target column used to compute the bad rate;
        1 is bad, 0 is good.
    cut_num : int
        Number of equal-frequency bins for the feature.
    hue : string
        Name of the feature to group by (None = no grouping).
    is_discrete : bool
        Whether `col` is a discrete variable.

    Return
    ----------
    return groupby_df
        The grouped analysis result table.
    '''

    def intervel_to_str(a):
        # Render a pandas Interval as '[left,right)'. Bounds that are
        # negative or > 10 are rounded to ints to keep tick labels short.
        # (name kept as-is; 'intervel' is a historical typo)
        if a.left < 0:
            left_num = str(int(round(a.left, 0)))
        elif a.left > 10:
            left_num = str(int(round(a.left, 0)))
        else:
            left_num = str(a.left)
        if a.right > 10:
            right_num = str(int(round(a.right, 0)))
        else:
            right_num = str(a.right)
        return f'[{left_num},{right_num})'

    values = data[col]
    cut_df = pd.DataFrame()
    # Numeric, non-discrete features get equal-frequency cut points
    # computed on the non-negative values; -9999999 catches sentinel /
    # missing-value codes in their own leftmost bin.
    if ((values.dtype == int) or
            (values.dtype == float)) and (is_discrete is False):
        tmp_split = values[values >= 0].tolist()
        tmp_split = [0] if tmp_split == [] else tmp_split
        cut_list = list(
            set([
                -9999999,
                min(values), 0,
                max(tmp_split) + 1e-9,  # +1e-9 so the max value lands inside the last bin
                min(tmp_split)
            ]))
        for i in range(1, cut_num):
            cut_list = list(
                set(cut_list + [np.percentile(tmp_split, 100 / cut_num * i)]))
        cut_list.sort()
        cut_df['cut'] = pd.cut(
            values, cut_list, include_lowest=True, right=False)
    else:
        # Discrete features are binned as-is, one bin per distinct value.
        cut_df['cut'] = data[col]
    cut_df[target] = data[target]
    if hue is not None:
        cut_df['hue'] = data[hue]
    else:
        cut_df['hue'] = -999  # sentinel hue => a single, unnamed group
    hues = cut_df['hue'].unique().tolist()
    if len(hues) > 1:
        # One grouped table per hue value, stacked vertically; 'type'
        # records which hue group each row belongs to.
        groupby_dfs = []
        for h in hues:
            groupby_df = pd.DataFrame()
            groups = cut_df[cut_df['hue'] == h].groupby('cut')
            groupby_df['count'] = groups.count()[target]
            groupby_df['sum'] = groups.sum()[target]
            groupby_df['bad_rate'] = groupby_df['sum'] / groupby_df['count']
            groupby_df['type'] = h
            groupby_df[
                'ratio'] = groupby_df['count'] / groupby_df['count'].sum()
            groupby_dfs.append(groupby_df)
        groupby_df = pd.concat(groupby_dfs, axis=0)
    else:
        groupby_df = pd.DataFrame()
        groups = cut_df.groupby('cut')
        groupby_df['count'] = groups.count()[target]
        groupby_df['sum'] = groups.sum()[target]
        groupby_df['bad_rate'] = groupby_df['sum'] / groupby_df['count']
        groupby_df['ratio'] = groupby_df['count'] / groupby_df['count'].sum()
    groupby_df = groupby_df.reset_index()
    if is_discrete is False:
        # Replace Interval objects with readable string labels.
        groupby_df['cut'] = groupby_df['cut'].apply(intervel_to_str)
    groupby_df = groupby_df[groupby_df['ratio'] > 0]  # drop empty bins
    groupby_df['cut'] = groupby_df['cut'].tolist()
    # Left panel: bad rate per bin.
    plt.figure(figsize=(24, 8))
    plt.subplot(1, 2, 1)
    plt.title(
        f'bad rate of {col}', fontdict={'size': 20}, fontproperties=myfont)
    if len(hues) > 1:
        ax = sns.barplot(
            x='cut',
            y='bad_rate',
            data=groupby_df,
            hue='type',
            saturation=0.5,
            ci=0.2,
            capsize=5)
        plt.setp(ax.get_legend().get_texts(), fontsize='15')
        plt.setp(ax.get_legend().get_title(), fontsize='15')
    else:
        ax = sns.barplot(
            x='cut',
            y='bad_rate',
            data=groupby_df,
            saturation=0.5,
            ci=0.2,
            capsize=5)
    plt.setp(ax.get_xaxis().get_label(), fontsize='20')
    plt.setp(ax.get_yaxis().get_label(), fontsize='20')
    plt.setp(ax.get_xaxis().get_ticklabels(), fontsize='15')
    plt.setp(ax.get_yaxis().get_ticklabels(), fontsize='15')
    plt.ylim((max(groupby_df['bad_rate'].min() - 1e-5, 0),
              groupby_df['bad_rate'].max() + 1e-5))
    plt.xticks(rotation=90, fontproperties=myfont)
    # Right panel: population ratio per bin.
    plt.subplot(1, 2, 2)
    plt.title(f'ratio of {col}', fontdict={'size': 20}, fontproperties=myfont)
    if len(hues) > 1:
        ax = sns.barplot(
            x='cut',
            y='ratio',
            data=groupby_df,
            hue='type',
            saturation=0.5,
            ci=0.2)
        plt.setp(ax.get_legend().get_texts(), fontsize='15')
        plt.setp(ax.get_legend().get_title(), fontsize='15')
    else:
        ax = sns.barplot(
            x='cut', y='ratio', data=groupby_df, saturation=0.5, ci=0.2)
    plt.setp(ax.get_xaxis().get_label(), fontsize='20')
    plt.setp(ax.get_yaxis().get_label(), fontsize='20')
    plt.setp(ax.get_xaxis().get_ticklabels(), fontsize='15')
    plt.setp(ax.get_yaxis().get_ticklabels(), fontsize='15')
    plt.xticks(rotation=90, fontproperties=myfont)
    return groupby_df
def _tag(features, labels, ds_type):
    # Copy a feature frame and attach its target labels and split name.
    out = features.copy()
    out['target'] = labels
    out['dataset_type'] = ds_type
    return out

tmp_df1 = _tag(X_training, y_training, 'train_val')
tmp_df2 = _tag(X_validation, y_validation, 'train_val')
tmp_df3 = _tag(X_testing, y_testing, 'test')
# NOTE: only validation + test are analysed; tmp_df1 is built but unused here.
single_analysis_df = pd.concat([tmp_df2, tmp_df3], axis=0)
# Run the single-variable binning analysis for every feature the model
# uses, in descending order of model importance.
for i, col in tn(enumerate(model_feature_importance['features']),
                 total=model_feature_importance.shape[0]):
    print(i+1, col)
    # Features with few distinct values are treated as discrete.
    treat_as_discrete = len(single_analysis_df[col].unique()) <= 7
    bin_df = bin_split(data=single_analysis_df, col=col, target='target',
                       cut_num=4, hue='dataset_type',
                       is_discrete=treat_as_discrete)
    # Disabled: monotonicity check on bad_rate per group, routing the figure
    # to a 'monotonic' vs 'non-monotonic' folder before saving.
    # bin_df['is_-9999999'] = bin_df['cut'].map(lambda x: 1 if '-9999999' in str(x) else 0)
    # num_ok = 0
    # for idx, t in enumerate(bin_df['type'].unique()):
    #     bbb = bin_df[(bin_df['type'] == t) & ((bin_df['is_-9999999'] == 0))]
    #     bbb_ratio = bbb['bad_rate'].tolist()
    #     if ((bbb_ratio == sorted(bbb_ratio)) or
    #             (bbb_ratio == sorted(bbb_ratio, reverse=True))):
    #         num_ok += 1
    # save_path = './savefig/单调/' if num_ok == idx + 1 else './savefig/非单调/'
    # save_path = './20220929任务结果/savefig/'
    # if os.path.isdir(save_path) is False:
    #     os.makedirs(save_path)
    # plt.savefig(f"{save_path}{i+1}|{col.split('_')[1].replace('/', '|')}|{col.split('_')[0]}.png", bbox_inches='tight')
0%| | 0/30 [00:00<?, ?it/s]
1 成交_成交土地均价(元/㎡) 2 资产规模数据_资产总计 3 区域财政_政府负债率(%) 4 财报衍生_净资产(即所有者权益) 5 财报衍生_其他应收款/总资产 6 区域财政_财政自给率(%) 7 区域财政_地方政府债务率(%) 8 财报衍生_固定资产 9 区域贷款_区域银行贷款余额(亿元) 10 区域利差_1年期不限评级 11 区域财政_一般预算收入占比(%) 12 财报衍生_其他应付款/流动负债 13 财报衍生_净利润 14 财报衍生_流动资产 15 偿债能力_现金比率 16 区域财政_基金收入占比(%) 17 区域经济_生产总值(GDP)(元) 18 偿债能力_货币资金/短期债务 19 区域财政_存续城投债务规模/一般预算收入(%) 20 区域财政_财政收入(元) 21 偿债能力_有形净值债务率 22 成长能力_归属母公司股东的权益(相对年初增长率) 23 财报衍生_其他应付款/流动资产 24 区域财政_一般预算收入增长 25 每股指标_每股现金流量净额 26 财报衍生_无形资产 27 财报衍生_存货周转天数 28 成长能力_净利润复合年增长率 29 成长能力_营业利润(N年,增长率) 30 区域经济_生产总值同比增长(%)
# Model feature importance enriched with IV, univariate KS/AUC, and VIF.
tmp = pd.concat([data_df, testSet], axis=0)
iv_df, woe_df = pf.iv_woe(tmp[['target'] + selected_features], 'target')
vif = pf.check_vif(tmp[selected_features])
KSs = []
AUCs = []
imp_df = model_feature_importance.copy()  # copy of the final model's feature importances
# For each feature the model uses, score the raw feature as a univariate
# classifier on the test set.
for col in tn(imp_df['features']):
    labels = testSet['target']
    raw_values = testSet[col]
    raw_auc = roc_auc_score(labels, raw_values)
    fpr, tpr, _ = roc_curve(labels, raw_values)
    # Direction-agnostic AUC: flip features that rank in reverse.
    AUCs.append(max(raw_auc, 1 - raw_auc))
    KSs.append(np.abs(fpr - tpr).max())
imp_df['KS'] = KSs
imp_df['AUC'] = AUCs
imp_df = pd.merge(iv_df, imp_df, on='features')
imp_df = pd.merge(imp_df, vif, on='features')
imp_df = imp_df.sort_values('importance', ascending=False)
imp_df
iv_woe: 0%| | 0/30 [00:00<?, ?it/s]
vif: 0%| | 0/30 [00:00<?, ?it/s]
123671859.64957677
0%| | 0/30 [00:00<?, ?it/s]
| features | iv | importance | KS | AUC | VIF | |
|---|---|---|---|---|---|---|
| 0 | 成交_成交土地均价(元/㎡) | 1.015336 | 0.184342 | 0.442876 | 0.765070 | 1.048077e+00 |
| 8 | 资产规模数据_资产总计 | 0.440099 | 0.175929 | 0.241275 | 0.643716 | 1.944197e+01 |
| 10 | 区域财政_政府负债率(%) | 0.376737 | 0.077259 | 0.352890 | 0.701944 | 1.236717e+08 |
| 11 | 财报衍生_净资产(即所有者权益) | 0.298295 | 0.061899 | 0.206440 | 0.609599 | 9.278368e+00 |
| 19 | 财报衍生_其他应收款/总资产 | 0.092960 | 0.048705 | 0.132413 | 0.571922 | 3.470148e+01 |
| 1 | 区域财政_财政自给率(%) | 0.935645 | 0.047808 | 0.399876 | 0.759827 | 2.124382e+00 |
| 12 | 区域财政_地方政府债务率(%) | 0.272864 | 0.035382 | 0.319231 | 0.687272 | 1.236719e+08 |
| 15 | 财报衍生_固定资产 | 0.225307 | 0.031424 | 0.169627 | 0.590696 | 3.136555e+00 |
| 2 | 区域贷款_区域银行贷款余额(亿元) | 0.661162 | 0.029288 | 0.377001 | 0.721983 | 1.041962e+00 |
| 5 | 区域利差_1年期不限评级 | 0.588315 | 0.028756 | 0.230495 | 0.520084 | 1.170056e+00 |
| 7 | 区域财政_一般预算收入占比(%) | 0.482476 | 0.028568 | 0.334062 | 0.705093 | 1.556958e+05 |
| 13 | 财报衍生_其他应付款/流动负债 | 0.226599 | 0.025955 | 0.188717 | 0.612887 | 7.567243e+01 |
| 16 | 财报衍生_净利润 | 0.219595 | 0.022245 | 0.160474 | 0.568054 | 1.417038e+00 |
| 14 | 财报衍生_流动资产 | 0.226576 | 0.022165 | 0.174274 | 0.592598 | 3.913026e+00 |
| 18 | 偿债能力_现金比率 | 0.110320 | 0.020235 | 0.200740 | 0.619950 | 1.210540e+07 |
| 9 | 区域财政_基金收入占比(%) | 0.431765 | 0.017771 | 0.309033 | 0.687950 | 1.072113e+00 |
| 3 | 区域经济_生产总值(GDP)(元) | 0.610885 | 0.016144 | 0.355763 | 0.720621 | 1.979093e+01 |
| 26 | 偿债能力_货币资金/短期债务 | 0.064087 | 0.015777 | 0.147355 | 0.574601 | 7.016615e+00 |
| 28 | 区域财政_存续城投债务规模/一般预算收入(%) | 0.024625 | 0.015196 | 0.086347 | 0.502860 | 1.557140e+05 |
| 4 | 区域财政_财政收入(元) | 0.589591 | 0.014360 | 0.352329 | 0.712676 | 1.950872e+01 |
| 17 | 偿债能力_有形净值债务率 | 0.147026 | 0.013088 | 0.144189 | 0.588595 | 1.210582e+07 |
| 22 | 成长能力_归属母公司股东的权益(相对年初增长率) | 0.080384 | 0.011857 | 0.149205 | 0.563867 | 7.979769e+01 |
| 29 | 财报衍生_其他应付款/流动资产 | 0.018235 | 0.010449 | 0.094744 | 0.546551 | 5.296924e+01 |
| 6 | 区域财政_一般预算收入增长 | 0.503313 | 0.009763 | 0.303605 | 0.647989 | 1.025558e+00 |
| 25 | 每股指标_每股现金流量净额 | 0.071257 | 0.008619 | 0.119301 | 0.551559 | 6.866977e+01 |
| 21 | 财报衍生_无形资产 | 0.089704 | 0.007537 | 0.111493 | 0.555340 | 1.771541e+00 |
| 24 | 财报衍生_存货周转天数 | 0.077135 | 0.005747 | 0.142353 | 0.569929 | 1.032172e+00 |
| 23 | 成长能力_净利润复合年增长率 | 0.077780 | 0.005468 | 0.115107 | 0.521688 | 3.605544e+02 |
| 27 | 成长能力_营业利润(N年,增长率) | 0.037175 | 0.004922 | 0.059886 | 0.506743 | 3.591344e+02 |
| 20 | 区域经济_生产总值同比增长(%) | 0.089911 | 0.003342 | 0.110915 | 0.541977 | 1.024103e+00 |